// Copyright 2011 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_STRING_SEARCH_H_
#define V8_STRING_SEARCH_H_

#include "src/isolate.h"
#include "src/vector.h"

namespace v8 {
namespace internal {

    //---------------------------------------------------------------------
    // String Search object.
    //---------------------------------------------------------------------

    // Class holding constants and methods that apply to all string search variants,
    // independently of subject and pattern char size.
    class StringSearchBase {
    protected:
        // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
        // limit, we can fix the size of tables. For a needle longer than this limit,
        // search will not be optimal, since we only build tables for a suffix
        // of the string, but it is a safe approximation.
        static const int kBMMaxShift = Isolate::kBMMaxShift;

        // Reduce alphabet to this size.
        // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
        // proportional to the input alphabet. We reduce the alphabet size by
        // equating input characters modulo a smaller alphabet size. This gives
        // a potentially less efficient searching, but is a safe approximation.
        // For needles using only characters in the same Unicode 256-code point page,
        // there is no search speed degradation.
        static const int kLatin1AlphabetSize = 256;
        static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;

        // Bad-char shift table stored in the state. It's length is the alphabet size.
        // For patterns below this length, the skip length of Boyer-Moore is too short
        // to compensate for the algorithmic overhead compared to simple brute force.
        static const int kBMMinPatternLength = 7;

        static inline bool IsOneByteString(Vector<const uint8_t> string)
        {
            return true;
        }

        static inline bool IsOneByteString(Vector<const uc16> string)
        {
            return String::IsOneByte(string.start(), string.length());
        }

        friend class Isolate;
    };

    template <typename PatternChar, typename SubjectChar>
    class StringSearch : private StringSearchBase {
    public:
        StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
            : isolate_(isolate)
            , pattern_(pattern)
            , start_(Max(0, pattern.length() - kBMMaxShift))
        {
            if (sizeof(PatternChar) > sizeof(SubjectChar)) {
                if (!IsOneByteString(pattern_)) {
                    strategy_ = &FailSearch;
                    return;
                }
            }
            int pattern_length = pattern_.length();
            if (pattern_length < kBMMinPatternLength) {
                if (pattern_length == 1) {
                    strategy_ = &SingleCharSearch;
                    return;
                }
                strategy_ = &LinearSearch;
                return;
            }
            strategy_ = &InitialSearch;
        }

        int Search(Vector<const SubjectChar> subject, int index)
        {
            return strategy_(this, subject, index);
        }

        static inline int AlphabetSize()
        {
            if (sizeof(PatternChar) == 1) {
                // Latin1 needle.
                return kLatin1AlphabetSize;
            } else {
                DCHECK_EQ(sizeof(PatternChar), 2);
                // UC16 needle.
                return kUC16AlphabetSize;
            }
        }

    private:
        typedef int (*SearchFunction)( // NOLINT - it's not a cast!
            StringSearch<PatternChar, SubjectChar>*,
            Vector<const SubjectChar>,
            int);

        static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
            Vector<const SubjectChar>,
            int)
        {
            return -1;
        }

        static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
            Vector<const SubjectChar> subject,
            int start_index);

        static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
            Vector<const SubjectChar> subject,
            int start_index);

        static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
            Vector<const SubjectChar> subject,
            int start_index);

        static int BoyerMooreHorspoolSearch(
            StringSearch<PatternChar, SubjectChar>* search,
            Vector<const SubjectChar> subject,
            int start_index);

        static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
            Vector<const SubjectChar> subject,
            int start_index);

        void PopulateBoyerMooreHorspoolTable();

        void PopulateBoyerMooreTable();

        static inline bool exceedsOneByte(uint8_t c)
        {
            return false;
        }

        static inline bool exceedsOneByte(uint16_t c)
        {
            return c > String::kMaxOneByteCharCodeU;
        }

        static inline int CharOccurrence(int* bad_char_occurrence,
            SubjectChar char_code)
        {
            if (sizeof(SubjectChar) == 1) {
                return bad_char_occurrence[static_cast<int>(char_code)];
            }
            if (sizeof(PatternChar) == 1) {
                if (exceedsOneByte(char_code)) {
                    return -1;
                }
                return bad_char_occurrence[static_cast<unsigned int>(char_code)];
            }
            // Both pattern and subject are UC16. Reduce character to equivalence class.
            int equiv_class = char_code % kUC16AlphabetSize;
            return bad_char_occurrence[equiv_class];
        }

        // The following tables are shared by all searches.
        // TODO(lrn): Introduce a way for a pattern to keep its tables
        // between searches (e.g., for an Atom RegExp).

        // Store for the BoyerMoore(Horspool) bad char shift table.
        // Return a table covering the last kBMMaxShift+1 positions of
        // pattern.
        int* bad_char_table()
        {
            return isolate_->bad_char_shift_table();
        }

        // Store for the BoyerMoore good suffix shift table.
        int* good_suffix_shift_table()
        {
            // Return biased pointer that maps the range  [start_..pattern_.length()
            // to the kGoodSuffixShiftTable array.
            return isolate_->good_suffix_shift_table() - start_;
        }

        // Table used temporarily while building the BoyerMoore good suffix
        // shift table.
        int* suffix_table()
        {
            // Return biased pointer that maps the range  [start_..pattern_.length()
            // to the kSuffixTable array.
            return isolate_->suffix_table() - start_;
        }

        Isolate* isolate_;
        // The pattern to search for.
        Vector<const PatternChar> pattern_;
        // Pointer to implementation of the search.
        SearchFunction strategy_;
        // Cache value of Max(0, pattern_length() - kBMMaxShift)
        int start_;
    };

    template <typename T, typename U>
    inline T AlignDown(T value, U alignment)
    {
        return reinterpret_cast<T>(
            (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
    }

    inline uint8_t GetHighestValueByte(uc16 character)
    {
        return Max(static_cast<uint8_t>(character & 0xFF),
            static_cast<uint8_t>(character >> 8));
    }

    inline uint8_t GetHighestValueByte(uint8_t character) { return character; }

    template <typename PatternChar, typename SubjectChar>
    inline int FindFirstCharacter(Vector<const PatternChar> pattern,
        Vector<const SubjectChar> subject, int index)
    {
        const PatternChar pattern_first_char = pattern[0];
        const int max_n = (subject.length() - pattern.length() + 1);

        const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
        const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
        int pos = index;
        do {
            DCHECK_GE(max_n - pos, 0);
            const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
                memchr(subject.start() + pos, search_byte,
                    (max_n - pos) * sizeof(SubjectChar)));
            if (char_pos == nullptr)
                return -1;
            char_pos = AlignDown(char_pos, sizeof(SubjectChar));
            pos = static_cast<int>(char_pos - subject.start());
            if (subject[pos] == search_char)
                return pos;
        } while (++pos < max_n);

        return -1;
    }

    //---------------------------------------------------------------------
    // Single Character Pattern Search Strategy
    //---------------------------------------------------------------------

    template <typename PatternChar, typename SubjectChar>
    int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
        StringSearch<PatternChar, SubjectChar>* search,
        Vector<const SubjectChar> subject,
        int index)
    {
        DCHECK_EQ(1, search->pattern_.length());
        PatternChar pattern_first_char = search->pattern_[0];
        if (sizeof(PatternChar) > sizeof(SubjectChar)) {
            if (exceedsOneByte(pattern_first_char)) {
                return -1;
            }
        }
        return FindFirstCharacter(search->pattern_, subject, index);
    }

    //---------------------------------------------------------------------
    // Linear Search Strategy
    //---------------------------------------------------------------------

    template <typename PatternChar, typename SubjectChar>
    inline bool CharCompare(const PatternChar* pattern,
        const SubjectChar* subject,
        int length)
    {
        DCHECK_GT(length, 0);
        int pos = 0;
        do {
            if (pattern[pos] != subject[pos]) {
                return false;
            }
            pos++;
        } while (pos < length);
        return true;
    }

    // Simple linear search for short patterns. Never bails out.
    template <typename PatternChar, typename SubjectChar>
    int StringSearch<PatternChar, SubjectChar>::LinearSearch(
        StringSearch<PatternChar, SubjectChar>* search,
        Vector<const SubjectChar> subject,
        int index)
    {
        Vector<const PatternChar> pattern = search->pattern_;
        DCHECK_GT(pattern.length(), 1);
        int pattern_length = pattern.length();
        int i = index;
        int n = subject.length() - pattern_length;
        while (i <= n) {
            i = FindFirstCharacter(pattern, subject, i);
            if (i == -1)
                return -1;
            DCHECK_LE(i, n);
            i++;
            // Loop extracted to separate function to allow using return to do
            // a deeper break.
            if (CharCompare(pattern.start() + 1,
                    subject.start() + i,
                    pattern_length - 1)) {
                return i - 1;
            }
        }
        return -1;
    }

    //---------------------------------------------------------------------
    // Boyer-Moore string search
    //---------------------------------------------------------------------

    template <typename PatternChar, typename SubjectChar>
    int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
        StringSearch<PatternChar, SubjectChar>* search,
        Vector<const SubjectChar> subject,
        int start_index)
    {
        Vector<const PatternChar> pattern = search->pattern_;
        int subject_length = subject.length();
        int pattern_length = pattern.length();
        // Only preprocess at most kBMMaxShift last characters of pattern.
        int start = search->start_;

        int* bad_char_occurence = search->bad_char_table();
        int* good_suffix_shift = search->good_suffix_shift_table();

        PatternChar last_char = pattern[pattern_length - 1];
        int index = start_index;
        // Continue search from i.
        while (index <= subject_length - pattern_length) {
            int j = pattern_length - 1;
            int c;
            while (last_char != (c = subject[index + j])) {
                int shift = j - CharOccurrence(bad_char_occurence, c);
                index += shift;
                if (index > subject_length - pattern_length) {
                    return -1;
                }
            }
            while (j >= 0 && pattern[j] == (c = subject[index + j]))
                j--;
            if (j < 0) {
                return index;
            } else if (j < start) {
                // we have matched more than our tables allow us to be smart about.
                // Fall back on BMH shift.
                index += pattern_length - 1
                    - CharOccurrence(bad_char_occurence,
                        static_cast<SubjectChar>(last_char));
            } else {
                int gs_shift = good_suffix_shift[j + 1];
                int bc_occ = CharOccurrence(bad_char_occurence, c);
                int shift = j - bc_occ;
                if (gs_shift > shift) {
                    shift = gs_shift;
                }
                index += shift;
            }
        }

        return -1;
    }

    template <typename PatternChar, typename SubjectChar>
    void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable()
    {
        int pattern_length = pattern_.length();
        const PatternChar* pattern = pattern_.start();
        // Only look at the last kBMMaxShift characters of pattern (from start_
        // to pattern_length).
        int start = start_;
        int length = pattern_length - start;

        // Biased tables so that we can use pattern indices as table indices,
        // even if we only cover the part of the pattern from offset start.
        int* shift_table = good_suffix_shift_table();
        int* suffix_table = this->suffix_table();

        // Initialize table.
        for (int i = start; i < pattern_length; i++) {
            shift_table[i] = length;
        }
        shift_table[pattern_length] = 1;
        suffix_table[pattern_length] = pattern_length + 1;

        if (pattern_length <= start) {
            return;
        }

        // Find suffixes.
        PatternChar last_char = pattern[pattern_length - 1];
        int suffix = pattern_length + 1;
        {
            int i = pattern_length;
            while (i > start) {
                PatternChar c = pattern[i - 1];
                while (suffix <= pattern_length && c != pattern[suffix - 1]) {
                    if (shift_table[suffix] == length) {
                        shift_table[suffix] = suffix - i;
                    }
                    suffix = suffix_table[suffix];
                }
                suffix_table[--i] = --suffix;
                if (suffix == pattern_length) {
                    // No suffix to extend, so we check against last_char only.
                    while ((i > start) && (pattern[i - 1] != last_char)) {
                        if (shift_table[pattern_length] == length) {
                            shift_table[pattern_length] = pattern_length - i;
                        }
                        suffix_table[--i] = pattern_length;
                    }
                    if (i > start) {
                        suffix_table[--i] = --suffix;
                    }
                }
            }
        }
        // Build shift table using suffixes.
        if (suffix < pattern_length) {
            for (int i = start; i <= pattern_length; i++) {
                if (shift_table[i] == length) {
                    shift_table[i] = suffix - start;
                }
                if (i == suffix) {
                    suffix = suffix_table[suffix];
                }
            }
        }
    }

    //---------------------------------------------------------------------
    // Boyer-Moore-Horspool string search.
    //---------------------------------------------------------------------

    template <typename PatternChar, typename SubjectChar>
    int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
        StringSearch<PatternChar, SubjectChar>* search,
        Vector<const SubjectChar> subject,
        int start_index)
    {
        Vector<const PatternChar> pattern = search->pattern_;
        int subject_length = subject.length();
        int pattern_length = pattern.length();
        int* char_occurrences = search->bad_char_table();
        int badness = -pattern_length;

        // How bad we are doing without a good-suffix table.
        PatternChar last_char = pattern[pattern_length - 1];
        int last_char_shift = pattern_length - 1 - CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
        // Perform search
        int index = start_index; // No matches found prior to this index.
        while (index <= subject_length - pattern_length) {
            int j = pattern_length - 1;
            int subject_char;
            while (last_char != (subject_char = subject[index + j])) {
                int bc_occ = CharOccurrence(char_occurrences, subject_char);
                int shift = j - bc_occ;
                index += shift;
                badness += 1 - shift; // at most zero, so badness cannot increase.
                if (index > subject_length - pattern_length) {
                    return -1;
                }
            }
            j--;
            while (j >= 0 && pattern[j] == (subject[index + j]))
                j--;
            if (j < 0) {
                return index;
            } else {
                index += last_char_shift;
                // Badness increases by the number of characters we have
                // checked, and decreases by the number of characters we
                // can skip by shifting. It's a measure of how we are doing
                // compared to reading each character exactly once.
                badness += (pattern_length - j) - last_char_shift;
                if (badness > 0) {
                    search->PopulateBoyerMooreTable();
                    search->strategy_ = &BoyerMooreSearch;
                    return BoyerMooreSearch(search, subject, index);
                }
            }
        }
        return -1;
    }

    template <typename PatternChar, typename SubjectChar>
    void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable()
    {
        int pattern_length = pattern_.length();

        int* bad_char_occurrence = bad_char_table();

        // Only preprocess at most kBMMaxShift last characters of pattern.
        int start = start_;
        // Run forwards to populate bad_char_table, so that *last* instance
        // of character equivalence class is the one registered.
        // Notice: Doesn't include the last character.
        int table_size = AlphabetSize();
        if (start == 0) { // All patterns less than kBMMaxShift in length.
            memset(bad_char_occurrence,
                -1,
                table_size * sizeof(*bad_char_occurrence));
        } else {
            for (int i = 0; i < table_size; i++) {
                bad_char_occurrence[i] = start - 1;
            }
        }
        for (int i = start; i < pattern_length - 1; i++) {
            PatternChar c = pattern_[i];
            int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
            bad_char_occurrence[bucket] = i;
        }
    }

    //---------------------------------------------------------------------
    // Linear string search with bailout to BMH.
    //---------------------------------------------------------------------

    // Simple linear search for short patterns, which bails out if the string
    // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
    template <typename PatternChar, typename SubjectChar>
    int StringSearch<PatternChar, SubjectChar>::InitialSearch(
        StringSearch<PatternChar, SubjectChar>* search,
        Vector<const SubjectChar> subject,
        int index)
    {
        Vector<const PatternChar> pattern = search->pattern_;
        int pattern_length = pattern.length();
        // Badness is a count of how much work we have done.  When we have
        // done enough work we decide it's probably worth switching to a better
        // algorithm.
        int badness = -10 - (pattern_length << 2);

        // We know our pattern is at least 2 characters, we cache the first so
        // the common case of the first character not matching is faster.
        for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
            badness++;
            if (badness <= 0) {
                i = FindFirstCharacter(pattern, subject, i);
                if (i == -1)
                    return -1;
                DCHECK_LE(i, n);
                int j = 1;
                do {
                    if (pattern[j] != subject[i + j]) {
                        break;
                    }
                    j++;
                } while (j < pattern_length);
                if (j == pattern_length) {
                    return i;
                }
                badness += j;
            } else {
                search->PopulateBoyerMooreHorspoolTable();
                search->strategy_ = &BoyerMooreHorspoolSearch;
                return BoyerMooreHorspoolSearch(search, subject, i);
            }
        }
        return -1;
    }

    // Perform a a single stand-alone search.
    // If searching multiple times for the same pattern, a search
    // object should be constructed once and the Search function then called
    // for each search.
    template <typename SubjectChar, typename PatternChar>
    int SearchString(Isolate* isolate,
        Vector<const SubjectChar> subject,
        Vector<const PatternChar> pattern,
        int start_index)
    {
        StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
        return search.Search(subject, start_index);
    }

    // A wrapper function around SearchString that wraps raw pointers to the subject
    // and pattern as vectors before calling SearchString. Used from the
    // StringIndexOf builtin.
    template <typename SubjectChar, typename PatternChar>
    intptr_t SearchStringRaw(Isolate* isolate, const SubjectChar* subject_ptr,
        int subject_length, const PatternChar* pattern_ptr,
        int pattern_length, int start_index)
    {
        DisallowHeapAllocation no_gc;
        Vector<const SubjectChar> subject(subject_ptr, subject_length);
        Vector<const PatternChar> pattern(pattern_ptr, pattern_length);
        return SearchString(isolate, subject, pattern, start_index);
    }

} // namespace internal
} // namespace v8

#endif // V8_STRING_SEARCH_H_
